import pandas as pd
import numpy as np
df1=pd.read_excel('非洲通讯产品销售数据(1).xlsx',sheetname='SalesData')
print(df1.head())
print(df1.shape)
#检验总数据
print(df1.info())
#检查每个列缺失值的占比
print(df1.apply(lambda x : sum(x.isnull())/len(x), axis=0))
#查看重复数据总和
print(df1.duplicated().sum())
#时间格式的转换,把不合法的转为null
df1.loc[:,'日期']=pd.to_datetime(df1.loc[:,'日期'].astype(str),format='%Y-%m-%d',errors='coerce')
print(df1.info())
#检查每个列缺失值的占比
print(df1.apply(lambda x : sum(x.isnull())/len(x), axis=0))
#查看异常值
#观察销售额是否超出合理范围
z_data=df1.copy()
cols=df1['销售额']
z_score=(cols-cols.mean())/cols.std()
z_data['销售额']=z_score.abs()>3.0
# print(z_data[z_data['销售额']==True])
# 检验销售额是否小于利润
print(df1[df1['销售额']<df1['利润']])
print(len(df1[df1['利润']==0]))
df1[df1['利润']==0]=np.nan
df1_new=df1.dropna()
df1_new.to_excel('SalesData处理版.xlsx',index=False)
df1_new=pd.read_excel('SalesData处理版.xlsx')
df1_new.loc[:,'日期']=pd.to_datetime(df1_new.loc[:,'日期'].astype(str),format='%Y-%m-%d')
#把年和季度隔开
df1_new['year']=df1_new['日期'].dt.year
df1_new['quarter']=df1_new['日期'].dt.quarter
#按地区分类
df1_new_area=df1_new.groupby(by=['地区','year','quarter'])['销售额','利润'].sum().reset_index()
print(df1_new_area)
# df1_new_area.to_excel('地区分类.xlsx',index=False)
#求国家分类销售额季度同比增长率
df1_new_country=df1_new.groupby(by=['国家','year','quarter'])['销售额','利润'].sum().reset_index()
country=df1_new['国家'].unique()
for i in country:
  df1_diff1=df1_new_country.loc[df1_new_country['国家']==i,'销售额'].diff(periods=1)
  df1_new_country.loc[df1_new_country['国家']==i,'销售额季同比']=df1_diff1/(df1_new_country.loc[df1_new_country['国家']==i,'销售额']-df1_diff1)
#求国家分类利润季度同比增长率
for i in country:
  df1_diff11=df1_new_country.loc[df1_new_country['国家']==i,'利润'].diff(periods=1)
  df1_new_country.loc[df1_new_country['国家']==i,'利润季同比']=df1_diff11/(df1_new_country.loc[df1_new_country['国家']==i,'利润']-df1_diff11)
print(df1_new_country)
df1_new_country.to_excel('国家分类季度同比增长率.xlsx',index=False)
#求国家分类销售额年增长率
df1_new_country2=df1_new.groupby(by=['国家','year'])['销售额','利润'].sum().reset_index()
country=df1_new['国家'].unique()
for i in country:
  df1_diff2=df1_new_country2.loc[df1_new_country2['国家']==i,'销售额'].diff(periods=1)
  df1_new_country2.loc[df1_new_country2['国家']==i,'销售额年增长率']=df1_diff2/(df1_new_country2.loc[df1_new_country2['国家']==i,'销售额']-df1_diff2)
# 求国家分类利润年增长率
  df1_diff22=df1_new_country2.loc[df1_new_country2['国家']==i,'利润'].diff(periods=1)
  df1_new_country2.loc[df1_new_country2['国家']==i,'利润年增长率']=df1_diff22/(df1_new_country2.loc[df1_new_country2['国家']==i,'利润']-df1_diff22)
print(df1_new_country2)
df1_new_country2.to_excel('国家分类年增长率.xlsx',index=False)
#求服务分类的销售额季度同比增长率
survice=df1_new['服务分类'].unique()
df1_new_survice=df1_new.groupby(by=['服务分类','year','quarter'])['销售额','利润'].sum().reset_index()
#求服务分类的利润季度同比增长率
for i in survice:
 df1_diff33=df1_new_survice.loc[df1_new_survice['服务分类']==i,'利润'].diff(periods=1)
 df1_new_survice.loc[df1_new_survice['服务分类']==i,'利润季同比']=df1_diff33/(df1_new_survice.loc[df1_new_survice['服务分类']==i,'利润']-df1_diff33)
print(df1_new_survice)
df1_new_survice.to_excel('服务分类季度增长率.xlsx',index=False)
#求服务分类的销售额年增长率
df1_new_survice2=df1_new.groupby(by=['服务分类','year'])['销售额','利润'].sum().reset_index()
for i in survice:
  df1_diff4=df1_new_survice2.loc[df1_new_survice2['服务分类']==i,'销售额'].diff(periods=1)
  df1_new_survice2.loc[df1_new_survice2['服务分类']==i,'销售额年增长率']=df1_diff4/(df1_new_survice2.loc[df1_new_survice2['服务分类']==i,'销售额']-df1_diff4)
#求服务分类利润年增长率
for i in survice:
  df1_diff44=df1_new_survice2.loc[df1_new_survice2['服务分类']==i,'利润'].diff(periods=1)
  df1_new_survice2.loc[df1_new_survice2['服务分类']==i,'利润年增长率']=df1_diff44/(df1_new_survice2.loc[df1_new_survice2['服务分类']==i,'销售额']-df1_diff44)

print(df1_new_survice2)
df1_new_survice2.to_excel('服务分类年增长率.xlsx',index=False)
#提出2020年数据
df1_2020=df1_new[df1_new['year']==2020]
df1_2020_country=df1_2020.groupby(by='国家')['销售额','利润'].sum().reset_index()
cols=['国家','销售额','利润']
df1_2020_country.columns=cols
#给某一列的数据排序(降序)
df1_2020_country=df1_2020_country.sort_values(by='销售额',ascending=False)
df1_2020_country3=df1_2020_country.iloc[:3,:]
print(df1_2020_country3)
df1_2020_country3.to_excel('2020年度销售额前三名国家.xlsx',index=False)
print(df1_new_country2[(df1_new_country2['国家']=='Niger')|(df1_new_country2['国家']=='Mauritania')|(df1_new_country2['国家']=='Eritrea')])
df1_new_country2.to_excel('前三名国家的年增长率.xlsx',index=False)
#
#
